package org.wyl.helloworld;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;

import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Page-crawling logic: extracts the link text of every entry in the
 * {@code #pl_top_realtimehot} table of a fetched page and publishes the list
 * under the {@code "hotKeys"} result field.
 */
public class HelloworldProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** CSS selector matching the cells ({@code .td-02}) that hold one hot-list entry each. */
    private static final String HOT_CELL_SELECTOR = "#pl_top_realtimehot > table > tbody .td-02";

    /** Name of the result field under which the extracted keywords are stored. */
    private static final String HOT_KEYS_FIELD = "hotKeys";

    /**
     * Crawl configuration, built once and handed out by {@link #getSite()}.
     *
     * <p>Cookie values are given without a trailing {@code ';'}: the semicolon is the
     * separator between {@code name=value} pairs in a Cookie header, not part of a value.
     *
     * <p>NOTE(review): these are hard-coded session cookies; they look like credentials
     * captured from a logged-in browser session and will presumably expire. Consider
     * loading them from external configuration instead of source control.
     */
    private final Site site = Site.me()
            .setDomain("s.weibo.com")
            .addCookie("SINAGLOBAL", "2141853883966.1582.1581588656429")
            .addCookie("UOR", "gl.ali213.net,widget.weibo.com,www.baidu.com")
            .addCookie("SUBP", "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFeiDSO70_IypWyNidTPY7E5JpX5KMhUgL.Fo-XeoqcShq0S052dJLoIE5LxK-L1K5LBoBLxK-LB--L1hMLxKBLB.2LB--XS0n7Sntt")
            .addCookie("SCF", "AqP_BOu8158gEGFyPwEeZHYHNUwjPZ0zIrzaQCeHjPjzM1ZLu_zoqzyn5IuA5OU9iEIM3ol_cu-bzJa8iEGuEDk.")
            .addCookie("SUB", "_2A25P2nwfDeThGeNK6VQX9CjPzDyIHXVsrurXrDV8PUNbmtANLUXbkW9NSVWG3AheKbBHU9QTNjv2VNYiI8rjqisS")
            .addCookie("_s_tentry", "login.sina.com.cn")
            .addCookie("Apache", "9730355459291.012.1658719323789")
            .addCookie("ULV", "1658719323794:32:1:1:9730355459291.012.1658719323789:1656321250415")
            .setRetryTimes(3)
            .setSleepTime(100);

    /**
     * Returns the crawl configuration (domain, cookies, retry count and sleep time).
     *
     * @return the {@link Site} instance owned by this processor; the same instance on every call
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the hot-list keywords from the given page.
     *
     * <p>Every element matched by the hot-list cell selector is re-parsed with Jsoup and
     * reduced to the text of the {@code <a>} elements it contains. The resulting list
     * (one string per matched cell, empty if nothing matched) is stored in the page's
     * result fields under {@code "hotKeys"}.
     *
     * @param page the fetched page to extract from
     */
    @Override
    public void process(Page page) {
        List<String> cellMarkup = page.getHtml().css(HOT_CELL_SELECTOR).all();
        List<String> hotKeys = cellMarkup.stream()
                .map(markup -> Jsoup.parse(markup).select("a").text())
                .collect(Collectors.toList());
        // putField hands the extracted content on to the downstream components.
        page.putField(HOT_KEYS_FIELD, hotKeys);
    }
}